# Core data-handling libraries
import pandas as pd
import numpy as np
# scikit-learn estimators compared in this article
from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier  # NOTE(review): duplicate — already imported two lines above
from sklearn.neural_network import MLPClassifier
# Model-selection and evaluation utilities
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex
## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":14})
sns.set_style("white")
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
# NOTE(review): the 'seaborn-whitegrid' style name was removed in matplotlib >= 3.6
# (renamed 'seaborn-v0_8-whitegrid') — this notebook assumes an older matplotlib.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# Render matplotlib figures inline in the notebook
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
# Silence library warnings to keep the article output clean
import warnings
warnings.filterwarnings("ignore")
In this article, we compare a number of classification methods for the breast cancer dataset. The details regarding this dataset can be found in Diagnostic Wisconsin Breast Cancer Database.

In this article, we compare a number of classification methods for the breast cancer dataset. We would use the following classification methods and then compare them in terms of performance.
Throughout this website, there are a large number of methods that discuss these methods. Here, we will not discuss these methods and only apply them. Interested readers are encouraged to see Statistical Learning.
# Load the Wisconsin breast-cancer dataset and assemble it into a DataFrame.
data = load_breast_cancer()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
# Title-cased class names, indexed by the numeric target code (0/1).
class_names = [name.title() for name in data['target_names'].tolist()]
df['Target'] = data['target']
# Human-readable label column derived from the numeric target.
df['Diagnosis'] = df['Target'].map(lambda code: class_names[1] if code == 1 else class_names[0])
del class_names
df
As can be seen, the number of instances is 569 and the number of attributes is 32. The objective of the exercise is to create a classification model that can classify the type of Diagnosis based on the rest of the attributes. However, first, let's plot a count plot for the Diagnosis attribute.
# Build the class-distribution table: count and percentage per Diagnosis value.
dist = df.groupby(['Diagnosis'])['Diagnosis'].agg({'count'}).reset_index(drop = False).rename(columns ={'count': 'Count'})
dist['Percentage'] = np.round(100* dist['Count'].values /dist['Count'].sum(), 2)
# display(dist.style.hide_index())
# Horizontal percentage bars, annotated inside with the raw counts.
fig = px.bar(dist, y= 'Diagnosis', x= 'Percentage', orientation='h', text = 'Count',
             color_discrete_sequence= ['Bisque'], height= 220)
# Bar outline and in-bar text styling (merged into a single traces update).
fig.update_traces(marker_line_color= 'DarkRed', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2}', textposition='inside')
# Layout: title, white background, hide text that would not fit.
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide',
                  title = 'Diagnosis Distribution', plot_bgcolor= 'white')
# Fix the x-axis to the full 0-100% range and frame both axes.
fig.update_xaxes(range=[0, 100], showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Split features and labels into train/test sets (default 75/25 split,
# fixed random_state for reproducibility).
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Summarise the resulting array shapes in a small one-row table.
shape_table = pd.DataFrame(
    data={'Set': ['X_train', 'X_test', 'y_train', 'y_test'],
          'Shape': [X_train.shape, X_test.shape, y_train.shape, y_test.shape]})
shape_table.set_index('Set').T
# Neighbors list: candidate k values 1..10
n_neighbors_list = list(np.arange(1,11,1))
# Transforming X into a (weighted) graph of k nearest neighbors.
# The graph is precomputed once for the LARGEST k so that every smaller k in
# the grid search can reuse the cached distances instead of re-querying.
graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list), mode='distance')
# KNeighborsClassifier consuming the precomputed distance matrix
classifier_model = KNeighborsClassifier(metric='precomputed')
# Making a pipeline: graph construction -> classification
full_model = Pipeline(steps=[('graph', graph_model), ('classifier', classifier_model)])
# Parameter grid: only the classifier's n_neighbors varies
param_grid = {'classifier__n_neighbors': n_neighbors_list}
# Searching over specified parameter values for an estimator (default 5-fold CV).
grid_model = GridSearchCV(full_model, param_grid)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best CV score / parameters, plus accuracy on the held-out test set.
# NOTE(review): Styler.hide_index() and set_precision() were removed in
# pandas >= 2.0 — this notebook assumes an older pandas.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
'Best Paramerers': [str(grid_model.best_params_)],
'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 6.5))
# Left: mean CV accuracy vs k (error bars = std across folds)
_ = ax[0].errorbar(x=n_neighbors_list,
y=grid_model.cv_results_['mean_test_score'],
yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='n_neighbors', title='Classification accuracy')
# Right: mean fit time vs k
_ = ax[1].errorbar(x=n_neighbors_list,
y=grid_model.cv_results_['mean_fit_time'],
yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='n_neighbors', title='Fit time (with caching)')
fig.tight_layout()
# Drop intermediates so later sections start from a clean namespace
del graph_model, classifier_model, full_model, param_grid, ax
See sklearn.linear_model.LogisticRegression for more details.
# Regularization strengths 1, 10, 100, 1000
Regularization_Strength = [10.0**x for x in range(4)]
# Inverse of regularization strength (sklearn's C parameter)
C = [1/x for x in Regularization_Strength]
# Parameter grid: stopping tolerance (1e-2 .. 1e-4) crossed with C
param_grid = {'tol': [10.0**x for x in np.arange(-2, -5, -1)], 'C': C,}
# Logistic Regression (high max_iter so the solver converges on raw features)
logistic = LogisticRegression(max_iter=10000)
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(logistic, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best CV score / parameters, plus accuracy on the held-out test set.
# NOTE(review): Styler.hide_index() / set_precision() require an older pandas.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
'Best Paramerers': [str(grid_model.best_params_)],
'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels for each parameter combination (strip dict punctuation)
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy per parameter combination (error bars = std)
_ = ax[0].errorbar(x= Temp,
y=grid_model.cv_results_['mean_test_score'],
yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time per parameter combination
_ = ax[1].errorbar(x= Temp,
y=grid_model.cv_results_['mean_fit_time'],
yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
# Drop intermediates so later sections start from a clean namespace
del Temp, grid_model, logistic
# Hyper-parameter grid for the PCA + logistic-regression pipeline: number of
# retained principal components and inverse regularization strength C.
param_grid = {'pca__n_components': [2, 5, 10, 15, 25, 30], 'logistic__C': np.logspace(-4, 4, 4),}
# Logistic Regression (loose tol=0.1 keeps the many grid-search fits cheap)
logistic = LogisticRegression(max_iter=10000, tol=0.1)
# Principal Component Analysis; n_components is supplied by the grid search
pca = PCA()
# Making a pipeline: dimensionality reduction first, then classification
full_model = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(full_model, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best CV score / parameters, plus accuracy on the held-out test set.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
'Best Paramerers': [str(grid_model.best_params_)],
'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels for each parameter combination (strip dict punctuation)
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 10))
# Left: mean CV accuracy per parameter combination (error bars = std)
_ = ax[0].errorbar(x= Temp,
y=grid_model.cv_results_['mean_test_score'],
yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time per parameter combination
_ = ax[1].errorbar(x= Temp,
y=grid_model.cv_results_['mean_fit_time'],
yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
# BUGFIX: delete the fitted `pca` instance, not the imported PCA class.
# The original `del PCA` removed the class itself from the namespace, so any
# later cell calling PCA() would raise NameError.
del Temp, full_model, grid_model, pca, logistic
See sklearn.tree.DecisionTreeClassifier for more details.
# Parameter grid: split criterion crossed with maximum tree depth 2..13
param_grid = {'criterion':['gini','entropy'], 'max_depth': np.arange(2,14)}
# Decision Tree Classifier (original comment wrongly said "Logistic Regression")
dtc = DecisionTreeClassifier()
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(dtc, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best CV score / parameters, plus accuracy on the held-out test set.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
'Best Paramerers': [str(grid_model.best_params_)],
'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels for each parameter combination (strip dict punctuation)
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy per parameter combination (error bars = std)
_ = ax[0].errorbar(x= Temp,
y=grid_model.cv_results_['mean_test_score'],
yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time per parameter combination
_ = ax[1].errorbar(x= Temp,
y=grid_model.cv_results_['mean_fit_time'],
yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
# Drop intermediates so later sections start from a clean namespace
del Temp, grid_model
# Parameter grid: penalty C crossed with RBF kernel width gamma.
# NOTE(review): the features are unscaled here — RBF SVMs are scale-sensitive,
# so a StandardScaler pipeline would likely change (improve) these results.
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
# Support Vector Machine; class_weight='balanced' reweights classes inversely
# to their frequencies to offset the 63/37 class imbalance
svm = SVC(kernel='rbf', class_weight='balanced')
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(svm, param_grid)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best CV score / parameters, plus accuracy on the held-out test set.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
'Best Paramerers': [str(grid_model.best_params_)],
'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels for each parameter combination (strip dict punctuation)
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy per parameter combination (error bars = std)
_ = ax[0].errorbar(x= Temp,
y=grid_model.cv_results_['mean_test_score'],
yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time per parameter combination
_ = ax[1].errorbar(x= Temp,
y=grid_model.cv_results_['mean_fit_time'],
yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
# Drop intermediates so later sections start from a clean namespace
del Temp, grid_model, svm
A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. See sklearn.ensemble.RandomForestClassifier for more details.
# Hyper-parameter grid: n_estimators in {100, 200}, max_depth in {2, 3},
# min_samples_leaf in {0.1, 0.01, 0.001} (fractions of the training set).
# BUGFIX: the original assignment ended with a stray trailing comma, which
# silently wrapped this dict in a 1-tuple. GridSearchCV happened to accept it
# (an iterable of dicts is allowed), but the tuple was clearly unintended.
param_grid = {'n_estimators': [n*100 for n in [2**m for m in np.arange(0,2)]],
'max_depth': list(np.arange(2,4)),
'min_samples_leaf': [10.0**x for x in np.arange(-1,-4,-1)]}
# Random Forest Classifier
rfc = RandomForestClassifier()
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(rfc, param_grid)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best CV score / parameters, plus accuracy on the held-out test set.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
'Best Paramerers': [str(grid_model.best_params_)],
'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels for each parameter combination (strip dict punctuation)
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy per parameter combination (error bars = std)
_ = ax[0].errorbar(x= Temp,
y=grid_model.cv_results_['mean_test_score'],
yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time per parameter combination
_ = ax[1].errorbar(x= Temp,
y=grid_model.cv_results_['mean_fit_time'],
yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
# Drop intermediates so later sections start from a clean namespace
del Temp, grid_model, rfc
Gradient Boosting Classifier builds the model in a stage-wise fashion and generalizes it by allowing optimization of an arbitrary differentiable loss function [Source]. See sklearn.ensemble.GradientBoostingClassifier for more details.
# Parameter grid: loss function, shrinkage rate, ensemble size and the
# fraction of samples drawn per stage (subsample < 1 => stochastic boosting).
# NOTE(review): the 'deviance' loss name was renamed 'log_loss' in
# scikit-learn 1.1 and removed in 1.3 — this notebook assumes an older sklearn.
param_grid = {'loss': ['deviance', 'exponential'],
'learning_rate': [0.1, 0.2, 0.3],
'n_estimators': [100, 200],
'subsample': [0.5, 1.0]}
# Gradient Boosting Classifier
gbc = GradientBoostingClassifier()
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(gbc, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best CV score / parameters, plus accuracy on the held-out test set.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
'Best Paramerers': [str(grid_model.best_params_)],
'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels for each parameter combination (strip dict punctuation)
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy per parameter combination (error bars = std)
_ = ax[0].errorbar(x= Temp,
y=grid_model.cv_results_['mean_test_score'],
yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time per parameter combination
_ = ax[1].errorbar(x= Temp,
y=grid_model.cv_results_['mean_fit_time'],
yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
# Drop intermediates so later sections start from a clean namespace
del Temp, grid_model, gbc
This model optimizes the log-loss function using LBFGS or stochastic gradient descent. See sklearn.neural_network.MLPClassifier.
# Parameter grid: solver, L2 penalty alpha (0.1, 0.01, 0.001) and the
# learning-rate schedule.
# NOTE(review): per the sklearn docs, 'learning_rate' is only used when
# solver='sgd', so its three values are redundant for 'lbfgs' and 'adam'.
param_grid = {'solver': ['lbfgs', 'sgd', 'adam'],
'alpha': [10.0**x for x in np.arange(-1,-4,-1)],
'learning_rate' : ['constant', 'invscaling', 'adaptive']}
# Multi-layer Perceptron classifier (high max_iter to allow convergence)
mlp = MLPClassifier(max_iter = 1000)
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(mlp, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Best CV score / parameters, plus accuracy on the held-out test set.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
'Best Paramerers': [str(grid_model.best_params_)],
'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels for each parameter combination (strip dict punctuation)
Temp = [str(x) for x in grid_model.cv_results_['params']]
Temp = [s.replace('{', '') for s in Temp]
Temp = [s.replace('}', '') for s in Temp]
Temp = [s.replace("'", '') for s in Temp]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy per parameter combination (error bars = std)
_ = ax[0].errorbar(x= Temp,
y=grid_model.cv_results_['mean_test_score'],
yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Paramerers', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time per parameter combination
_ = ax[1].errorbar(x= Temp,
y=grid_model.cv_results_['mean_fit_time'],
yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Paramerers', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
# Drop intermediates so later sections start from a clean namespace
del Temp, grid_model
It seems that the Gradient Boosting Classifier performs slightly better than the rest of the classification methods in this study. All of these classification methods were tuned to perform at their best by applying GridSearchCV.